{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 12.1.2 PCA主成分分析代码实现"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1.二维空间降维Python代码实现"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1, 1],\n",
       "       [2, 2],\n",
       "       [3, 3]])"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "X = np.array([[1, 1], [2, 2], [3, 3]])\n",
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   0  1\n",
       "0  1  1\n",
       "1  2  2\n",
       "2  3  3"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 也可以通过pandas库来构造数据，效果一样\n",
    "import pandas as pd\n",
    "X = pd.DataFrame([[1, 1], [2, 2], [3, 3]])\n",
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-1.41421356],\n",
       "       [ 0.        ],\n",
       "       [ 1.41421356]])"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 数据降维，由二维降至一维\n",
    "from sklearn.decomposition import PCA\n",
    "pca = PCA(n_components=1)\n",
    "pca.fit(X)  # 进行降维模型训练\n",
    "X_transformed = pca.transform(X)  # 进行数据降维，并赋值给X_transformed\n",
    "\n",
    "X_transformed  # 查看降维后的结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3, 1)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 查看此时的维度\n",
    "X_transformed.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.70710678, 0.70710678]])"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 查看降维的系数\n",
    "pca.components_  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.7071067811865476 * X + 0.7071067811865475 * Y\n"
     ]
    }
   ],
   "source": [
    "# 查看线性组合表达式\n",
    "a = pca.components_[0][0] \n",
    "b = pca.components_[0][1]\n",
    "print(str(a) + ' * X + ' +  str(b) + ' * Y')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "这个的确也和12.1.1节我们获得的线性组合是一样的。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "2.三维空间降维Python代码实现"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>年龄(岁)</th>\n",
       "      <th>负债比率</th>\n",
       "      <th>月收入(元)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>45</td>\n",
       "      <td>0.80</td>\n",
       "      <td>9120</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>40</td>\n",
       "      <td>0.12</td>\n",
       "      <td>2600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>38</td>\n",
       "      <td>0.09</td>\n",
       "      <td>3042</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>30</td>\n",
       "      <td>0.04</td>\n",
       "      <td>3300</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>39</td>\n",
       "      <td>0.21</td>\n",
       "      <td>3500</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   年龄(岁)  负债比率  月收入(元)\n",
       "0     45  0.80    9120\n",
       "1     40  0.12    2600\n",
       "2     38  0.09    3042\n",
       "3     30  0.04    3300\n",
       "4     39  0.21    3500"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "X = pd.DataFrame([[45, 0.8, 9120], [40, 0.12, 2600], [38, 0.09, 3042], [30, 0.04, 3300], [39, 0.21, 3500]], columns=['年龄(岁)', '负债比率', '月收入(元)'])\n",
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 1.36321743,  1.96044639,  1.98450514],\n",
       "       [ 0.33047695, -0.47222431, -0.70685302],\n",
       "       [-0.08261924, -0.57954802, -0.52440206],\n",
       "       [-1.73500401, -0.75842087, -0.41790353],\n",
       "       [ 0.12392886, -0.15025319, -0.33534653]])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 因为三个指标数据的量级相差较大，所以可以先进行数据归一化处理\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "X_new = StandardScaler().fit_transform(X)\n",
    "\n",
    "X_new  # 查看归一化后的数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 3.08724247,  0.32991205],\n",
       "       [-0.52888635, -0.74272137],\n",
       "       [-0.70651782, -0.33057258],\n",
       "       [-1.62877292,  1.05218639],\n",
       "       [-0.22306538, -0.30880449]])"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.decomposition import PCA\n",
    "pca = PCA(n_components=2)\n",
    "pca.fit(X_new)  # 进行降维模型训练\n",
    "X_transformed = pca.transform(X_new)  # 进行数据降维，并赋值给X_transformed\n",
    "\n",
    "X_transformed  # 查看降维后的结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.52952108,  0.61328179,  0.58608264],\n",
       "       [-0.82760701,  0.22182579,  0.51561609]])"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 查看降维的系数\n",
    "pca.components_  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.5295210839165538 * 年龄(岁) + 0.6132817922410683 * 负债比率 + 0.5860826434841948 * 月收入(元)\n",
      "-0.8276070105929828 * 年龄(岁) + 0.2218257919336094 * 负债比率 + 0.5156160917294705 * 月收入(元)\n"
     ]
    }
   ],
   "source": [
    "dim = ['年龄(岁)', '负债比率', '月收入(元)']\n",
    "for i in pca.components_:\n",
    "    formula = []\n",
    "    for j in range(len(i)):\n",
    "        formula.append(str(i[j]) + ' * ' + dim[j])\n",
    "    print(\" + \".join(formula))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.5295210839165538 * X + 0.6132817922410683 * Y + 0.5860826434841948 * Z\n",
      "-0.8276070105929828 * X + 0.2218257919336094 * Y + 0.5156160917294705 * Z\n"
     ]
    }
   ],
   "source": [
    "# 如果不想显示具体的特征名称，可以采用如下的写法\n",
    "dim = ['X', 'Y', 'Z']\n",
    "for i in pca.components_:\n",
    "    formula = []\n",
    "    for j in range(len(i)):\n",
    "        formula.append(str(i[j]) + ' * ' + dim[j])\n",
    "    print(\" + \".join(formula))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
